#delimit cr
*version 11
pause on
set more off
graph set ps logo off

* Close any log left open by an earlier run, then start a fresh text log.
capture log close
set logtype text
set linesize 200
log using ../log/clean-time-card-data.log , replace

/* --------------------------------------

Clean the time-card data.

Reads one raw time-card file per year, logs descriptive output,
reshapes the time stamps to one row per employee-day, saves the
stacked result, and draws a histogram of the first time stamp
of the day.

--------------------------------------- */

* Start from an empty session.
clear all
estimates clear

** For each calendar year: import the raw time-card file, parse the time
** stamp, log some descriptives, clean the employee codes, reshape to one
** row per employee-day, and store the result in a tempfile whose macro
** name is piece_ followed by the year. The stacking step after the loop
** relies on those macro names.
foreach year in 2010 2011 2012 2013 {

	disp ""
	disp ""
	disp ""
	disp "Now bring in time card data for year `year'"

	insheet using ../src/time-card-data/time-card-record-`year'.txt , comma names clear

	d, f
	sum

	** Peek at a slice of rows. The range is clipped to the number of rows
	** so that a short file does not abort the run on a diagnostic listing.
	local last_row = min(_N, 9020)
	if `last_row' >= 9000 {
		list in 9000/`last_row'
	}

	************************************************************
	**   Create date and time
	************************************************************

	disp ""
	disp ""
	disp ""
	disp "Now clean time variables, `year'"

	** Components are cut out of timerecord by fixed character position.
	** NOTE(review): the positions suggest a layout like
	** YYYY-MM-DD hh:mm:ss -- confirm against the raw files.
	gen year = real( substr(timerecord, 1, 4 ) )
	tab year , miss

	gen month = real( substr(timerecord, 6, 2) )
	tab month , miss

	gen day = real( substr(timerecord, 9, 2) )
	tab day , miss

	gen hour = real( substr(timerecord, 12, 2) )
	tab hour , miss

	gen minute = real( substr(timerecord, 15, 2) )
	tab minute , miss

	gen seconds = real( substr(timerecord, 18, 2) )
	tab seconds , miss

	gen date = mdy( month, day, year)
	format date %td
	codebook date

	** Time of day expressed as a share of 24 hours; used only for ordering
	** the time stamps within an employee-day.
	gen fraction_of_day = (hour / 24) + (minute / (60 * 24)) + (seconds / (60 * 60 * 24))
	sum fraction_of_day

	************************************************************
	**   Describe data
	************************************************************

	disp ""
	disp ""
	disp ""
	disp "Now describe data for year `year'"

	** Just show the data (first 400 rows, or fewer if the file is short)
	disp "Just list raw data, year `year'"
	sort empcode date fraction_of_day
	local last_row = min(_N, 400)
	if `last_row' >= 1 {
		list date fraction_of_day timerecord empcode in 1/`last_row' , sepby(date empcode)
	}

	** Describe two workers. The sort key includes fraction_of_day so that
	** the time stamps within each day are listed in chronological order
	** rather than in whatever order a coarser sort happens to leave them.
	disp "Describe the experience of a few workers, year `year'"
	sort empcode date fraction_of_day
	list date fraction_of_day timerecord if empcode == "S13390" , sepby(date)
	list date fraction_of_day timerecord if empcode == "S17778" , sepby(date)

	** Describe observations per day per worker
	bysort empcode date: gen obs_per_day = _N
	tab obs_per_day
	drop obs_per_day

	** Explore typical show-up times: hour of the earliest stamp per day
	disp "Describe when people show up to work, year `year'"
	sort empcode date fraction_of_day
	by empcode date: gen first_ob_of_day = (_n == 1)
	tab hour if first_ob_of_day == 1
	drop first_ob_of_day

	************************************************************
	**   Clean the worker identifiers
	************************************************************

	disp ""
	disp ""
	disp ""
	disp "Clean worker identifiers for year `year'"

	** First, note that we keep the employee code as upper-case
	codebook empcode
	replace empcode = upper(empcode)
	codebook empcode

	** Describe problematic employee codes: list the codes with the most
	** time-stamp records. n_records counts rows, not distinct days.
	preserve
		bysort empcode: gen n_records = _N
		bysort empcode: keep if _n == 1
		gsort -n_records
		local last_row = min(_N, 20)
		if `last_row' >= 1 {
			list empcode n_records in 1/`last_row'
		}
	restore

	** Drop records without a usable employee code.
	** NOTE(review): the second literal is spelled with a zero. It may be a
	** real junk value seen in the data or a typo for NULL -- confirm
	** against the listing above before changing it.
	drop if empcode == ""
	drop if empcode == "N0ULL"

	************************************************************
	**   Reshape the data
	************************************************************

	disp ""
	disp ""
	disp ""
	disp "Reshape data for year `year'"

	** We want to save multiple time-stamps per worker per day

	** login numbers the stamps of an employee-day in chronological order.
	gsort empcode date +fraction_of_day
	by empcode date: gen login = _n
	tab login

	** There is a tiny share of worker-days with more than four time
	** stamps. We keep the first four and drop the rest.
	drop if login > 4

	** Another diagnostic slice, clipped to the number of rows available.
	local last_row = min(_N, 9100)
	if `last_row' >= 9000 {
		list empcode date hour minute in 9000/`last_row' , sepby(empcode date)
	}

	** One id per employee-day, used as the reshape key.
	egen worker_date = group(empcode date)
	sum worker_date

	** After the reshape each row is one employee-day with hour1-hour4 and
	** minute1-minute4 (only as many as occur in this year).
	keep date empcode hour minute worker_date login
	reshape wide hour minute , i(worker_date) j(login)
	drop worker_date 

	************************************************************
	**   Save & Close
	************************************************************

	disp ""
	disp ""
	disp ""
	disp "Save & close out year `year'"

	** These variable names are consistent with what we have in the
	** main dataset.
	rename empcode EmployeeID
	rename date Date

	compress
	tempfile piece_`year'
	save `piece_`year''
}

************************************************************
**   Stack the pieces together
************************************************************

** Begin with an empty dataset and append the yearly tempfiles in order.
clear
forvalues year = 2010/2013 {
	append using `piece_`year''
}

************************************************************
**   Save out a copy
************************************************************

** Log the structure, then require that EmployeeID and Date together
** identify each row uniquely; isid stops the run otherwise.
describe , fullnames
isid EmployeeID Date

compress
save ../dta/reshaped-time-stamps.dta , replace

************************************************************
**   Create histogram of first time stamp of the day
************************************************************

** The data in memory are only modified temporarily; restore brings back
** the saved employee-day dataset unchanged.
preserve

	** First time stamp of the day in decimal hours (8.5 means 08:30).
	gen clock_in_time = hour1 + (minute1 / 60)

	** Aspect ratio of roughly 0.618 (height over width).
	local inv_golden_ratio = 2 / ( sqrt(5) + 1 )

	** NOTE(review): graph set changes a graph preference rather than an
	** option of this one graph, and the font must be installed -- confirm
	** this is intended on shared machines.
	graph set window fontface "Garamond" 

	** histogram plots densities unless told otherwise. The y axis is
	** titled as a count of worker-days, so request frequencies explicitly.
	histogram clock_in_time , frequency fcolor(gs10) ///
	ylabel(, nogrid angle(horizontal)  ) ///
	scheme(s2mono) ///
	graphregion(fcolor(white)) ///
	xline(8.00) ///
	aspectratio(`inv_golden_ratio') ///
	legend(off) ///
	yscale( nofextend ) xscale(nofextend) ///
	xtitle("Time of arrival at work") ytitle("Worker-Days") 
	graph save ../gph/histogram-clock_in_time.gph , replace

restore



log close
exit

